import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import scipy.stats as stats
from sklearn import metrics
from sklearn import tree
from sklearn.ensemble import BaggingRegressor,RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# To tune model, get different metric scores and split data
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
from sklearn.preprocessing import FunctionTransformer
# To build a logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
# To use statistical functions
import scipy.stats as stats
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To impute missing values
from sklearn.impute import KNNImputer
# To build a logistic regression model
from sklearn.linear_model import LogisticRegression
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# Load the raw churn dataset and work on a copy so the original frame stays untouched
bank_data = pd.read_csv("BankChurners.csv")
df = bank_data.copy()
df.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | ... | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | ... | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | ... | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | ... | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | ... | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
5 rows × 21 columns
df.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | ... | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | ... | 2 | 3 | 4003.0 | 1851 | 2152.0 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | ... | 2 | 3 | 4277.0 | 2186 | 2091.0 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | ... | 3 | 4 | 5409.0 | 0 | 5409.0 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | ... | 3 | 3 | 5281.0 | 0 | 5281.0 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | ... | 2 | 4 | 10388.0 | 1961 | 8427.0 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
5 rows × 21 columns
df[df.duplicated()].count()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
Since we have already checked that all values are unique to a customer, CLIENTNUM is useless to us statistically, we will drop it.
# CLIENTNUM is a unique per-customer identifier with no statistical value, so drop it
df.drop('CLIENTNUM',axis=1,inplace=True)
#Checking that it was dropped
df.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.0 | 777 | 11914.0 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.0 | 864 | 7392.0 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.0 | 0 | 3418.0 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.0 | 2517 | 796.0 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.0 | 0 | 4716.0 | 2.175 | 816 | 28 | 2.500 | 0.000 |
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns.") # f-string
There are 10127 rows and 20 columns.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null object 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null object 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null object 5 Marital_Status 9378 non-null object 6 Income_Category 10127 non-null object 7 Card_Category 10127 non-null object 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(9), object(6) memory usage: 1.5+ MB
# Cast the nominal columns to pandas' category dtype (more memory-efficient and
# marks them explicitly as categorical for describe/select_dtypes)
categorical_var = [
    'Attrition_Flag', 'Gender', 'Education_Level',
    'Marital_Status', 'Income_Category', 'Card_Category',
]
df[categorical_var] = df[categorical_var].astype('category')
#checking if properly changed
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Attrition_Flag 10127 non-null category 1 Customer_Age 10127 non-null int64 2 Gender 10127 non-null category 3 Dependent_count 10127 non-null int64 4 Education_Level 8608 non-null category 5 Marital_Status 9378 non-null category 6 Income_Category 10127 non-null category 7 Card_Category 10127 non-null category 8 Months_on_book 10127 non-null int64 9 Total_Relationship_Count 10127 non-null int64 10 Months_Inactive_12_mon 10127 non-null int64 11 Contacts_Count_12_mon 10127 non-null int64 12 Credit_Limit 10127 non-null float64 13 Total_Revolving_Bal 10127 non-null int64 14 Avg_Open_To_Buy 10127 non-null float64 15 Total_Amt_Chng_Q4_Q1 10127 non-null float64 16 Total_Trans_Amt 10127 non-null int64 17 Total_Trans_Ct 10127 non-null int64 18 Total_Ct_Chng_Q4_Q1 10127 non-null float64 19 Avg_Utilization_Ratio 10127 non-null float64 dtypes: category(6), float64(5), int64(9) memory usage: 1.1 MB
df.isnull().sum()
Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
# looking at value counts for non-numeric features
num_to_display = 10  # defined once up here so it's easy to change later
for colname in df.select_dtypes(include='category').columns:
    val_counts = df[colname].value_counts(dropna=False)  # include NA counts
    print(val_counts.head(num_to_display))
    if len(val_counts) > num_to_display:
        print(f'Only displaying first {num_to_display} of {len(val_counts)} values.')
    print('\n\n')  # extra whitespace between columns
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 F 5358 M 4769 Name: Gender, dtype: int64 Graduate 3128 High School 2013 NaN 1519 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 Married 4687 Single 3943 NaN 749 Divorced 748 Name: Marital_Status, dtype: int64 Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64
# 'abc' is a data-entry artifact in Income_Category; fold it into a real bucket.
# NOTE(review): mapping it to 'Less than $40K' is the original author's choice of
# default bucket for the bad entries — confirm with the data owner if possible.
# Assign the result back instead of Series.replace(..., inplace=True): in-place
# replace on a selected column is chained assignment, which is deprecated in
# pandas 2.x and stops working in pandas 3.0 (copy-on-write).
df['Income_Category'] = df['Income_Category'].replace('abc', 'Less than $40K')
df['Income_Category'].value_counts()
Less than $40K 4673 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64
# Median of each numeric column. numeric_only=True is required: since pandas 2.0
# DataFrame.median no longer silently skips non-numeric columns and would raise
# a TypeError on the category columns.
df.median(numeric_only=True)
Customer_Age 46.000 Dependent_count 2.000 Months_on_book 36.000 Total_Relationship_Count 4.000 Months_Inactive_12_mon 2.000 Contacts_Count_12_mon 2.000 Credit_Limit 4549.000 Total_Revolving_Bal 1276.000 Avg_Open_To_Buy 3474.000 Total_Amt_Chng_Q4_Q1 0.736 Total_Trans_Amt 3899.000 Total_Trans_Ct 67.000 Total_Ct_Chng_Q4_Q1 0.702 Avg_Utilization_Ratio 0.176 dtype: float64
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Customer_Age | 10127.0 | 46.325960 | 8.016814 | 26.0 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.0 | 2.346203 | 1.298908 | 0.0 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.0 | 35.928409 | 7.986416 | 13.0 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.0 | 3.812580 | 1.554408 | 1.0 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.0 | 2.341167 | 1.010622 | 0.0 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.0 | 2.455317 | 1.106225 | 0.0 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.0 | 8631.953698 | 9088.776650 | 1438.3 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.0 | 1162.814061 | 814.987335 | 0.0 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.0 | 7469.139637 | 9090.685324 | 3.0 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.0 | 0.759941 | 0.219207 | 0.0 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.0 | 4404.086304 | 3397.129254 | 510.0 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.0 | 64.858695 | 23.472570 | 10.0 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.0 | 0.712222 | 0.238086 | 0.0 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.0 | 0.274894 | 0.275691 | 0.0 | 0.023 | 0.176 | 0.503 | 0.999 |
df.describe(include=["category"])
| Attrition_Flag | Gender | Education_Level | Marital_Status | Income_Category | Card_Category | |
|---|---|---|---|---|---|---|
| count | 10127 | 10127 | 8608 | 9378 | 10127 | 10127 |
| unique | 2 | 2 | 6 | 3 | 5 | 4 |
| top | Existing Customer | F | Graduate | Married | Less than $40K | Blue |
| freq | 8500 | 5358 | 3128 | 4687 | 4673 | 9436 |
def histogram_boxplot(data, feature, figsize=(15, 10), kde=False, bins=None):
    """
    Draw a boxplot (top) and a histogram (bottom) of one numeric column
    on a shared x-axis.

    data: dataframe
    feature: dataframe column name
    figsize: size of figure (default (15, 10))
    kde: whether to overlay the density curve on the histogram (default False)
    bins: number of bins for the histogram (default None, i.e. seaborn chooses)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # two stacked panels: box on top, histogram below
        sharex=True,  # both panels use the same x scale
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # Boxplot; showmeans=True marks the mean with a star
    sns.boxplot(
        data=data, x=feature, ax=ax_box2, showmeans=True, color="violet"
    )
    # Plain if/else instead of the original conditional expression that was
    # evaluated only for its plotting side effect. The palette="winter" argument
    # was dropped: seaborn ignores palette when no hue is given and warns about it.
    if bins:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2, bins=bins)
    else:
        sns.histplot(data=data, x=feature, kde=kde, ax=ax_hist2)
    # Reference lines: dashed green = mean, solid black = median
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")
# function to create labeled barplots
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count or percentage written above each bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of counts (default False)
    n: display only the top n category levels (default None, i.e. all levels)
    """
    total = len(data[feature])  # number of rows; denominator for percentages
    count = data[feature].nunique()
    # Scale the figure width with the number of bars actually shown
    if n is None:
        plt.figure(figsize=(count + 2, 6))
    else:
        plt.figure(figsize=(n + 2, 6))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # top-n levels by frequency, then shown in sorted label order
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truth test instead of `perc == True`
            label = "{:.1f}%".format(
                100 * p.get_height() / total
            )  # percentage of each class of the category
        else:
            label = p.get_height()  # count of each level of the category
        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # write the label just above the bar
    plt.show()  # show the plot
labeled_barplot(df, "Attrition_Flag", perc=True)
histogram_boxplot(df,"Customer_Age")
labeled_barplot(df, "Gender", perc=True)
histogram_boxplot(df,"Dependent_count")
labeled_barplot(df, "Education_Level", perc=True)
labeled_barplot(df, "Marital_Status", perc=True)
labeled_barplot(df, "Income_Category", perc=True)
labeled_barplot(df, "Card_Category", perc=True)
histogram_boxplot(df,"Months_on_book")
histogram_boxplot(df,"Total_Relationship_Count")
histogram_boxplot(df,"Months_Inactive_12_mon")
histogram_boxplot(df,"Contacts_Count_12_mon")
histogram_boxplot(df,"Credit_Limit")
histogram_boxplot(df,"Total_Revolving_Bal")
histogram_boxplot(df,"Avg_Open_To_Buy")
histogram_boxplot(df,"Total_Trans_Amt")
histogram_boxplot(df,"Total_Trans_Ct")
histogram_boxplot(df,"Total_Amt_Chng_Q4_Q1")
histogram_boxplot(df,"Total_Ct_Chng_Q4_Q1")
histogram_boxplot(df,"Avg_Utilization_Ratio")
plt.figure(figsize=(15, 7))
sns.heatmap(df.corr(), annot=True, vmin=-1, vmax=1, cmap="Spectral")
plt.show()
sns.pairplot(data=df, hue="Attrition_Flag")
plt.show()
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # value_counts is sorted descending, so index[-1] is the rarest target
    # class (here 'Attrited Customer'); rows are sorted by that class
    sorter = data[target].value_counts().index[-1]
    # Absolute counts, with row/column margins
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # Row-normalised proportions for the stacked bars
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 5, 6))
    # The original called plt.legend twice; the first call ("lower left",
    # frameon=False) was dead code because this second call immediately
    # replaced the legend. Only the effective call is kept.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
stacked_barplot(df,"Gender","Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df,"Education_Level","Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Education_Level All 1371 7237 8608 Graduate 487 2641 3128 High School 306 1707 2013 Uneducated 237 1250 1487 College 154 859 1013 Doctorate 95 356 451 Post-Graduate 92 424 516 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df,"Marital_Status","Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Marital_Status All 1498 7880 9378 Married 709 3978 4687 Single 668 3275 3943 Divorced 121 627 748 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df,"Income_Category","Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Income_Category All 1627 8500 10127 Less than $40K 799 3874 4673 $40K - $60K 271 1519 1790 $80K - $120K 242 1293 1535 $60K - $80K 189 1213 1402 $120K + 126 601 727 ------------------------------------------------------------------------------------------------------------------------
stacked_barplot(df,"Card_Category","Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Card_Category All 1627 8500 10127 Blue 1519 7917 9436 Silver 82 473 555 Gold 21 95 116 Platinum 5 15 20 ------------------------------------------------------------------------------------------------------------------------
# Numeric columns to compare against the target. The original wrapped this list
# in df[[...]].columns.tolist(), which is a no-op round trip; the literal list
# is equivalent.
cols = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
plt.figure(figsize=(15, 20))
for i, variable in enumerate(cols):
    plt.subplot(4, 4, i + 1)
    # Keyword arguments are required: seaborn >= 0.12 removed support for
    # passing the x/y vectors positionally.
    sns.boxplot(x=df["Attrition_Flag"], y=df[variable])
    plt.tight_layout()
    plt.title(variable)
plt.show()
Data Description:
Data Cleaning:
Observations from EDA
Attrition_Flag:
Customer_Age:
Gender:
Dependent_count:
Education_Level:
Marital_Status:
Income_Category:
Card_Category:
Months_on_book:
Total_Relationship_Count:
Months_Inactive_12_mon:
Contacts_Count_12_mon:
Credit_Limit:
Total_Revolving_Bal:
Avg_Open_To_Buy:
Total_Amt_Chng_Q4_Q1:
Total_Trans_Amt:
Total_Trans_Ct:
Total_Ct_Chng_Q4_Q1:
Avg_Utilization_Ratio:
Attrition_Flag vs Other Variables
def IQR_detect(data, colname):
    """
    Return the values of `colname` lying farther than 4*IQR from the column median.

    data: dataframe to screen
    colname: name of the column to screen for outliers
    Returns a Series of the outlying values with their original index preserved,
    so callers can drop those rows.
    """
    # Quartiles are computed on non-null values only
    quartiles = np.quantile(data[colname][data[colname].notnull()], [.25, .75])
    # A deliberately wide fence (4 * IQR) so only extreme points are flagged
    colname_4iqr = 4 * (quartiles[1] - quartiles[0])
    # Bug fix: the original indexed the *global* df here instead of the `data`
    # parameter, so the function silently ignored its argument.
    colname_outliers = data.loc[
        np.abs(data[colname] - data[colname].median()) > colname_4iqr, colname
    ]
    return colname_outliers
# Columns to screen for extreme values with the 4*IQR rule in IQR_detect
outliercols = ["Months_Inactive_12_mon","Contacts_Count_12_mon","Credit_Limit","Avg_Open_To_Buy","Total_Trans_Amt",
"Total_Amt_Chng_Q4_Q1","Total_Ct_Chng_Q4_Q1"]
# Drop flagged rows column by column. Note: each drop shrinks df, so later
# columns are screened against the already-reduced frame (order matters).
for name in outliercols:
    name_outliers = IQR_detect(df,name)
    df.drop(name_outliers.index, axis=0, inplace=True)
check = IQR_detect(df, "Total_Trans_Amt")
check
9140 14132
9147 14213
9159 13945
9162 13820
9180 14084
...
9940 14043
9974 13969
9978 14209
9989 13986
10088 13940
Name: Total_Trans_Amt, Length: 103, dtype: int64
# Predictors: everything except the target, one-hot encoded
X = df.drop("Attrition_Flag", axis=1)
X = pd.get_dummies(X)
# Target encoding: Existing Customer -> 1, Attrited Customer -> 0
y = df["Attrition_Flag"]
y = y.map({'Existing Customer':1 ,'Attrited Customer':0})
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% -> a 60/20/20 overall split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(5724, 34) (1908, 34) (1908, 34)
# Median imputation, fitted on the training fold only so the validation/test
# statistics cannot leak into the model.
imputer = SimpleImputer(strategy="median")
# SimpleImputer.fit returns the fitted estimator itself, so `impute` and
# `imputer` are the same object — both names refer to the fitted imputer.
impute = imputer.fit(X_train)
X_train = impute.transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted sklearn classifier.

    model: classifier
    predictors: independent variables
    target: dependent variable
    Returns a one-row DataFrame holding the four scores.
    """
    # Predict once, then score the predictions with each metric
    pred = model.predict(predictors)
    scores = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(scores, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix as a heatmap, annotating each cell with both
    its raw count and its share of all predictions.

    model: classifier
    predictors: independent variables
    target: dependent variable
    """
    y_pred = model.predict(predictors)
    cm = confusion_matrix(target, y_pred)
    total = cm.flatten().sum()  # grand total, hoisted out of the loop
    # Build a "count\npercent" label for each cell (binary problem -> 2x2 grid)
    cell_labels = []
    for item in cm.flatten():
        cell_labels.append(
            ["{0:0.0f}".format(item) + "\n{0:.2%}".format(item / total)]
        )
    labels = np.asarray(cell_labels).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline logistic regression on the (imbalanced) training set
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
# NOTE(review): the next line is the notebook's echoed repr of the fitted
# estimator; as a bare expression it is harmless but has no effect.
LogisticRegression(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
# 5-fold stratified cross-validated recall of the baseline model
cv_result_bfr = cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(lr, X_val,y_val)
lr_train_score=model_performance_classification_sklearn(lr, X_train, y_train)
print("Training performance \n",lr_train_score)
Training performance
Accuracy Recall Precision F1
0 0.884696 0.956842 0.909 0.932308
lr_val_score=model_performance_classification_sklearn(lr, X_val,y_val)
print("Validation performance \n",lr_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.875262 0.9482 0.905854 0.926543
dtree = DecisionTreeClassifier(criterion='gini',random_state=1)
dtree.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=dtree, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(dtree, X_val,y_val)
dtree_train_score=model_performance_classification_sklearn(dtree, X_train, y_train)
print("Training performance \n",dtree_train_score)
Training performance
Accuracy Recall Precision F1
0 1.0 1.0 1.0 1.0
dtree_val_score=model_performance_classification_sklearn(dtree, X_val,y_val)
print("Validation performance \n",dtree_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.922432 0.950095 0.956135 0.953105
bagging = BaggingClassifier(random_state=1)
bagging.fit(X_train,y_train)
BaggingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=bagging, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(bagging, X_val,y_val)
bagging_train_score=model_performance_classification_sklearn(bagging, X_train, y_train)
print("Training performance \n",bagging_train_score)
Training performance
Accuracy Recall Precision F1
0 0.996855 0.996842 0.999367 0.998103
bagging_val_score=model_performance_classification_sklearn(bagging, X_val,y_val)
print("Validation performance \n",bagging_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.949161 0.969678 0.969066 0.969372
abc = AdaBoostClassifier(random_state=1)
abc.fit(X_train,y_train)
AdaBoostClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=abc, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(abc, X_val,y_val)
abc_train_score= model_performance_classification_sklearn(abc, X_train,y_train)
print("Training performance \n",abc_train_score)
Training performance
Accuracy Recall Precision F1
0 0.963312 0.982316 0.973706 0.977992
abc_val_score= model_performance_classification_sklearn(abc, X_val,y_val)
print("Validation performance \n",abc_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.953878 0.977258 0.96748 0.972344
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(X_train,y_train)
GradientBoostingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=gbc, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(gbc, X_val,y_val)
gbc_train_score= model_performance_classification_sklearn(gbc, X_train,y_train)
print("Training performance \n",gbc_train_score)
Training performance
Accuracy Recall Precision F1
0 0.975192 0.990737 0.9796 0.985137
gbc_val_score= model_performance_classification_sklearn(gbc, X_val,y_val)
print("Validation performance \n",gbc_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.963312 0.988629 0.967842 0.978125
xgb = XGBClassifier(random_state=1,eval_metric='logloss')
xgb.fit(X_train,y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=12,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=xgb, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
confusion_matrix_sklearn(xgb, X_val,y_val)
xgb_train_score= model_performance_classification_sklearn(xgb, X_train,y_train)
print("Training performance \n",xgb_train_score)
Training performance
Accuracy Recall Precision F1
0 1.0 1.0 1.0 1.0
xgb_val_score= model_performance_classification_sklearn(xgb, X_val,y_val)
print("Validation performance \n",xgb_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.967505 0.987366 0.973832 0.980552
print("Before UpSampling, counts of label 'Existing Customer': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'Attrited Customer': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
sampling_strategy=1, k_neighbors=5, random_state=1
) # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Existing Customer': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'Attrited Customer': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Existing Customer': 4750 Before UpSampling, counts of label 'Attrited Customer': 974 After UpSampling, counts of label 'Existing Customer': 4750 After UpSampling, counts of label 'Attrited Customer': 4750 After UpSampling, the shape of train_X: (9500, 34) After UpSampling, the shape of train_y: (9500,)
log_reg_over = LogisticRegression(random_state=1)
# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over, y_train_over)
LogisticRegression(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_over = cross_val_score(
estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
confusion_matrix_sklearn(log_reg_over, X_val, y_val)
lr_over_train_score = model_performance_classification_sklearn(log_reg_over, X_train_over, y_train_over)
print("Training performance \n",lr_over_train_score)
Training performance
Accuracy Recall Precision F1
0 0.851684 0.836842 0.862443 0.84945
lr_over_val_score = model_performance_classification_sklearn(log_reg_over, X_val, y_val)
print("Validation performance \n",lr_over_val_score)
Validation performance
Accuracy Recall Precision F1
0 0.81761 0.818699 0.955048 0.881633
# Decision tree (gini impurity) trained on the oversampled training set.
dtree_over = DecisionTreeClassifier(criterion='gini', random_state=1)
dtree_over.fit(X_train_over, y_train_over)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=dtree_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

confusion_matrix_sklearn(dtree_over, X_val, y_val)

dtree_over_train_score = model_performance_classification_sklearn(
    dtree_over, X_train_over, y_train_over
)
print("Training performance \n", dtree_over_train_score)
# Output: Accuracy 1.0  Recall 1.0  Precision 1.0  F1 1.0  (unpruned tree memorizes the training set)

dtree_over_val_score = model_performance_classification_sklearn(dtree_over, X_val, y_val)
print("Validation performance \n", dtree_over_val_score)
# Output: Accuracy 0.913522  Recall 0.934934  Precision 0.959792  F1 0.9472
# Bagging classifier (default base estimator) on the oversampled training set.
bagging_over = BaggingClassifier(random_state=1)
bagging_over.fit(X_train_over, y_train_over)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=bagging_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

confusion_matrix_sklearn(bagging_over, X_val, y_val)

bagging_over_train_score = model_performance_classification_sklearn(
    bagging_over, X_train_over, y_train_over
)
print("Training performance \n", bagging_over_train_score)
# Output: Accuracy 0.997263  Recall 0.995368  Precision 0.999155  F1 0.997258

bagging_over_val_score = model_performance_classification_sklearn(bagging_over, X_val, y_val)
print("Validation performance \n", bagging_over_val_score)
# Output: Accuracy 0.927149  Recall 0.939987  Precision 0.971279  F1 0.955377
# AdaBoost classifier on the oversampled training set.
abc_over = AdaBoostClassifier(random_state=1)
abc_over.fit(X_train_over, y_train_over)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=abc_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

confusion_matrix_sklearn(abc_over, X_val, y_val)

abc_over_train_score = model_performance_classification_sklearn(
    abc_over, X_train_over, y_train_over
)
print("Training performance \n", abc_over_train_score)
# Output: Accuracy 0.969684  Recall 0.968842  Precision 0.970477  F1 0.969659

abc_over_val_score = model_performance_classification_sklearn(abc_over, X_val, y_val)
print("Validation performance \n", abc_over_val_score)
# Output: Accuracy 0.941824  Recall 0.958307  Precision 0.971191  F1 0.964706
# Gradient boosting classifier on the oversampled training set.
gbc_over = GradientBoostingClassifier(random_state=1)
gbc_over.fit(X_train_over, y_train_over)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=gbc_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

confusion_matrix_sklearn(gbc_over, X_val, y_val)

gbc_over_train_score = model_performance_classification_sklearn(
    gbc_over, X_train_over, y_train_over
)
print("Training performance \n", gbc_over_train_score)
# Output: Accuracy 0.983684  Recall 0.981895  Precision 0.985422  F1 0.983655

gbc_over_val_score = model_performance_classification_sklearn(gbc_over, X_val, y_val)
print("Validation performance \n", gbc_over_val_score)
# Output: Accuracy 0.955451  Recall 0.972836  Precision 0.973451  F1 0.973144
# XGBoost classifier on the oversampled training set.
# eval_metric='logloss' silences the deprecation warning about the default metric.
xgb_over = XGBClassifier(random_state=1, eval_metric='logloss')
xgb_over.fit(X_train_over, y_train_over)
# Output (fitted-model repr, abbreviated): XGBClassifier(learning_rate=0.300000012,
#   max_depth=6, n_estimators=100, eval_metric='logloss', random_state=1, ...)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=xgb_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

confusion_matrix_sklearn(xgb_over, X_val, y_val)

xgb_over_train_score = model_performance_classification_sklearn(
    xgb_over, X_train_over, y_train_over
)
print("Training performance \n", xgb_over_train_score)
# Output: Accuracy 1.0  Recall 1.0  Precision 1.0  F1 1.0

xgb_over_val_score = model_performance_classification_sklearn(xgb_over, X_val, y_val)
print("Validation performance \n", xgb_over_val_score)
# Output: Accuracy 0.962264  Recall 0.979154  Precision 0.975456  F1 0.977301
# Random undersampling: shrink the majority class down to the minority-class size.
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
# NOTE(review): these labels assume class 1 == 'Existing Customer' — the later
# preprocessing cell encodes Existing=0 / Attrited=1, so confirm the encoding
# used for this section before trusting the label text.
print("Before Under Sampling, counts of label 'Existing Customer': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'Attriting Customer': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Existing Customer' {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'Attriting Customer': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
# Output: 4750 / 974 before; 974 / 974 after; train_X (1948, 34); train_y (1948,)

# Logistic regression trained on the undersampled training set.
lr_un = LogisticRegression(random_state=1)
lr_un.fit(X_train_un, y_train_un)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=lr_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

lr_un_train_score = model_performance_classification_sklearn(lr_un, X_train_un, y_train_un)
print("Training performance \n", lr_un_train_score)
# Output: Accuracy 0.836756  Recall 0.817248  Precision 0.850427  F1 0.833508

lr_un_val_score = model_performance_classification_sklearn(lr_un, X_val, y_val)
print("Validation performance \n", lr_un_val_score)
# Output: Accuracy 0.793501  Recall 0.784586  Precision 0.959073  F1 0.863099
# Decision tree (gini) trained on the undersampled training set.
dtree_un = DecisionTreeClassifier(criterion="gini", random_state=1)
dtree_un.fit(X_train_un, y_train_un)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=dtree_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

dtree_un_train_score = model_performance_classification_sklearn(dtree_un, X_train_un, y_train_un)
print("Training performance \n", dtree_un_train_score)
# Output: Accuracy 1.0  Recall 1.0  Precision 1.0  F1 1.0

dtree_un_val_score = model_performance_classification_sklearn(dtree_un, X_val, y_val)
print("Validation performance \n", dtree_un_val_score)
# Output: Accuracy 0.887841  Recall 0.885028  Precision 0.977669  F1 0.929045
# Bagging classifier on the undersampled training set.
bagging_un = BaggingClassifier(random_state=1)
bagging_un.fit(X_train_un, y_train_un)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=bagging_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

bagging_un_train_score = model_performance_classification_sklearn(
    bagging_un, X_train_un, y_train_un
)
print("Training performance \n", bagging_un_train_score)
# Output: Accuracy 0.99538  Recall 0.991786  Precision 0.998966  F1 0.995363

bagging_un_val_score = model_performance_classification_sklearn(bagging_un, X_val, y_val)
print("Validation performance \n", bagging_un_val_score)
# Output: Accuracy 0.895178  Recall 0.884397  Precision 0.988003  F1 0.933333
# AdaBoost classifier on the undersampled training set.
abc_un = AdaBoostClassifier(random_state=1)
abc_un.fit(X_train_un, y_train_un)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=abc_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

abc_un_train_score = model_performance_classification_sklearn(abc_un, X_train_un, y_train_un)
print("Training performance \n", abc_un_train_score)
# Output: Accuracy 0.958932  Recall 0.953799  Precision 0.963693  F1 0.95872

abc_un_val_score = model_performance_classification_sklearn(abc_un, X_val, y_val)
print("Validation performance \n", abc_un_val_score)
# Output: Accuracy 0.916667  Recall 0.914087  Precision 0.984354  F1 0.94792
# Gradient boosting classifier on the undersampled training set.
gbc_un = GradientBoostingClassifier(random_state=1)
gbc_un.fit(X_train_un, y_train_un)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=gbc_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

gbc_un_train_score = model_performance_classification_sklearn(gbc_un, X_train_un, y_train_un)
print("Training performance \n", gbc_un_train_score)
# Output: Accuracy 0.98306  Recall 0.972279  Precision 0.993704  F1 0.982875

gbc_un_val_score = model_performance_classification_sklearn(gbc_un, X_val, y_val)
print("Validation performance \n", gbc_un_val_score)
# Output: Accuracy 0.922432  Recall 0.919141  Precision 0.986441  F1 0.951602
# XGBoost classifier on the undersampled training set.
xgb_un = XGBClassifier(random_state=1, eval_metric='logloss')
xgb_un.fit(X_train_un, y_train_un)
# Output (fitted-model repr, abbreviated): XGBClassifier(learning_rate=0.300000012,
#   max_depth=6, n_estimators=100, eval_metric='logloss', random_state=1, ...)

# 5-fold stratified cross-validated recall.
scoring = "recall"
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_result_over = cross_val_score(
    estimator=xgb_un, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
plt.boxplot(cv_result_over)
plt.show()

xgb_un_train_score = model_performance_classification_sklearn(xgb_un, X_train_un, y_train_un)
print("Training performance \n", xgb_un_train_score)
# Output: Accuracy 1.0  Recall 1.0  Precision 1.0  F1 1.0

xgb_un_val_score = model_performance_classification_sklearn(xgb_un, X_val, y_val)
print("Validation performance \n", xgb_un_val_score)
# Output: Accuracy 0.932914  Recall 0.930512  Precision 0.987928  F1 0.95836
# Side-by-side comparison of every model trained so far (original, oversampled,
# undersampled), on training and validation data. Each *_score frame is a single
# row of Accuracy/Recall/Precision/F1, so transposing and concatenating on
# axis=1 yields metrics as rows and models as columns.

# One shared column list — previously duplicated verbatim for both tables.
model_comparison_columns = [
    "Logistic Regression",
    "Logistic Regression Oversample",
    "Logistic Regression Undersample",
    "Decision Tree Classifier",
    "Decision Tree Oversample",
    "Decision Tree Undersample",
    "Bagging Classifier",
    "Bagging Oversample",
    "Bagging Undersample",
    "AdaBoost Classifier",
    "AdaBoost Oversample",
    "AdaBoost Undersample",
    "Gradient Boost Classifier",
    "Gradient Boost Oversample",
    "Gradient Boost Undersample",
    "XGB Classifier",
    "XGB Oversample",
    "XGB Undersample",
]

# Training performance comparison.
models_train_comp_df = pd.concat(
    [lr_train_score.T, lr_over_train_score.T, lr_un_train_score.T,
     dtree_train_score.T, dtree_over_train_score.T, dtree_un_train_score.T,
     bagging_train_score.T, bagging_over_train_score.T, bagging_un_train_score.T,
     abc_train_score.T, abc_over_train_score.T, abc_un_train_score.T,
     gbc_train_score.T, gbc_over_train_score.T, gbc_un_train_score.T,
     xgb_train_score.T, xgb_over_train_score.T, xgb_un_train_score.T],
    axis=1,
)
models_train_comp_df.columns = model_comparison_columns
print("Training performance comparison:")
models_train_comp_df
# Output (abbreviated): trees/XGB hit 1.0 on all training metrics (overfit);
# LR ~0.85-0.88 accuracy; bagging ~0.996-0.997; AdaBoost ~0.96-0.97; GBC ~0.975-0.984.

# Validation performance comparison.
models_val_comp_df = pd.concat(
    [lr_val_score.T, lr_over_val_score.T, lr_un_val_score.T,
     dtree_val_score.T, dtree_over_val_score.T, dtree_un_val_score.T,
     bagging_val_score.T, bagging_over_val_score.T, bagging_un_val_score.T,
     abc_val_score.T, abc_over_val_score.T, abc_un_val_score.T,
     gbc_val_score.T, gbc_over_val_score.T, gbc_un_val_score.T,
     xgb_val_score.T, xgb_over_val_score.T, xgb_un_val_score.T],
    axis=1,
)
models_val_comp_df.columns = model_comparison_columns
print("Validation performance comparison:")
models_val_comp_df
# Output (abbreviated): best validation recall/F1 from GBC (recall 0.9886,
# F1 0.9781) and XGB (recall 0.9874, F1 0.9806) on the original training data.
# %%time  (IPython cell magic — invalid in a plain .py file, kept as a comment)
# ---- AdaBoost tuning 1: exhaustive GridSearchCV over stump/shallow-tree bases ----
model = AdaBoostClassifier(random_state=1)
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
# Optimize for recall: missing an attriting customer is the costly error.
scorer = metrics.make_scorer(metrics.recall_score)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_))
# Output: Best Parameters:{'base_estimator': DecisionTreeClassifier(max_depth=1,
#   random_state=1), 'learning_rate': 0.1, 'n_estimators': 10}  Score: 1.0
# Wall time: 55.2 s
# NOTE(review): a CV recall of exactly 1.0 usually means the model predicts the
# positive class for everything — the train/val tables below (precision ~0.83)
# confirm that; worth revisiting the scoring choice.

# Refit with the best grid-search parameters.
abc_tuned1 = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, random_state=1),
    learning_rate=0.1,
    n_estimators=10,
    random_state=1,
)
abc_tuned1.fit(X_train, y_train)

abc_tuned1_train_score = model_performance_classification_sklearn(
    abc_tuned1, X_train, y_train
)
print("Training performance:")
abc_tuned1_train_score  # notebook display
# Output: Accuracy 0.829839  Recall 1.0  Precision 0.829839  F1 0.907008

# Calculating different metrics on validation set
abc_tuned1_val_score = model_performance_classification_sklearn(abc_tuned1, X_val, y_val)
print("Validation performance:")
abc_tuned1_val_score  # notebook display
# Output: Accuracy 0.829665  Recall 1.0  Precision 0.829665  F1 0.906903

# creating confusion matrix
confusion_matrix_sklearn(abc_tuned1, X_val, y_val)

# %%time  (IPython cell magic — kept as a comment)
# ---- AdaBoost tuning 2: RandomizedSearchCV over the same grid ----
model = AdaBoostClassifier(random_state=1)
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
scorer = metrics.make_scorer(metrics.recall_score)
randomized_cv = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid, n_jobs=-1,
    n_iter=50, scoring=scorer, cv=5, random_state=1,
)
randomized_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(randomized_cv.best_params_, randomized_cv.best_score_))
# Output: Best parameters are {'n_estimators': 50, 'learning_rate': 0.01,
#   'base_estimator': DecisionTreeClassifier(max_depth=1, random_state=1)} with CV score=1.0
# Wall time: 17.8 s

# Refit with the best randomized-search parameters.
abc_tuned2 = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1, random_state=1),
    learning_rate=0.01,
    n_estimators=50,
    random_state=1,
)
abc_tuned2.fit(X_train, y_train)

abc_tuned2_train_score = model_performance_classification_sklearn(
    abc_tuned2, X_train, y_train
)
print("Training performance:")
abc_tuned2_train_score  # notebook display
# Output: Accuracy 0.829839  Recall 1.0  Precision 0.829839  F1 0.907008

# Calculating different metrics on validation set
# BUG FIX: original scored abc_tuned1 here, so abc_tuned2_val_score silently
# duplicated abc_tuned1's numbers instead of evaluating abc_tuned2.
abc_tuned2_val_score = model_performance_classification_sklearn(abc_tuned2, X_val, y_val)
print("Validation performance:")
abc_tuned2_val_score  # notebook display
# Output (from original run, computed on abc_tuned1 due to the bug):
#   Accuracy 0.829665  Recall 1.0  Precision 0.829665  F1 0.906903

# creating confusion matrix
confusion_matrix_sklearn(abc_tuned2, X_val, y_val)
# %%time  (IPython cell magic — invalid in a plain .py file, kept as a comment)
# ---- Gradient boosting tuning 1: GridSearchCV ----
model = GradientBoostingClassifier(random_state=1)
param_grid = {
    "n_estimators": np.arange(50, 250, 50),
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
}
# Optimize for recall.
scorer = metrics.make_scorer(metrics.recall_score)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_))
# Output: Best Parameters:{'max_features': 1, 'n_estimators': 50, 'subsample': 1}
#   Score: 0.9936842105263158   Wall time: 23.2 s
# NOTE(review): max_features=1 (an int) means ONE feature per split, not 100%
# of features — confirm that was the intent vs. max_features=1.0.

# Refit with the best grid-search parameters.
gbc_tuned1 = GradientBoostingClassifier(
    random_state=1,
    max_features=1,
    subsample=1,
    n_estimators=50,
)
gbc_tuned1.fit(X_train, y_train)

gbc_tuned1_train_score = model_performance_classification_sklearn(
    gbc_tuned1, X_train, y_train
)
print("Training performance:")
gbc_tuned1_train_score  # notebook display
# Output: Accuracy 0.895528  Recall 0.994526  Precision 0.891994  F1 0.940474

# Calculating different metrics on validation set
gbc_tuned1_val_score = model_performance_classification_sklearn(gbc_tuned1, X_val, y_val)
print("Validation performance:")
gbc_tuned1_val_score  # notebook display
# Output: Accuracy 0.891509  Recall 0.998105  Precision 0.88565  F1 0.938521

# creating confusion matrix
confusion_matrix_sklearn(gbc_tuned1, X_val, y_val)

# %%time  (IPython cell magic — kept as a comment)
# ---- Gradient boosting tuning 2: RandomizedSearchCV over the same grid ----
model = GradientBoostingClassifier(random_state=1)
param_grid = {
    "n_estimators": np.arange(50, 250, 50),
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
}
scorer = metrics.make_scorer(metrics.recall_score)
randomized_cv = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid, n_jobs=-1,
    n_iter=50, scoring=scorer, cv=5, random_state=1,
)
randomized_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(randomized_cv.best_params_, randomized_cv.best_score_))
# Output: Best parameters are {'subsample': 1, 'n_estimators': 50,
#   'max_features': 1} with CV score=0.9936842105263158   Wall time: 23.2 s

# Refit with the best randomized-search parameters (same as grid search here).
gbc_tuned2 = GradientBoostingClassifier(
    random_state=1,
    max_features=1,
    subsample=1,
    n_estimators=50,
)
gbc_tuned2.fit(X_train, y_train)

gbc_tuned2_train_score = model_performance_classification_sklearn(
    gbc_tuned2, X_train, y_train
)
print("Training performance:")
gbc_tuned2_train_score  # notebook display
# Output: Accuracy 0.895528  Recall 0.994526  Precision 0.891994  F1 0.940474

# Calculating different metrics on validation set
gbc_tuned2_val_score = model_performance_classification_sklearn(gbc_tuned2, X_val, y_val)
print("Validation performance:")
gbc_tuned2_val_score  # notebook display
# Output: Accuracy 0.891509  Recall 0.998105  Precision 0.88565  F1 0.938521

# creating confusion matrix
confusion_matrix_sklearn(gbc_tuned2, X_val, y_val)
# %%time  (IPython cell magic — invalid in a plain .py file, kept as a comment)
# ---- XGBoost tuning 1: exhaustive GridSearchCV (2304 candidates x 5 folds) ----
model = XGBClassifier(random_state=1, eval_metric='logloss')
param_grid = {
    'n_estimators': np.arange(50, 150, 50),
    'scale_pos_weight': [2, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.05],
    'gamma': [0, 1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'max_depth': np.arange(1, 5, 1),
    'reg_lambda': [5, 10],
}
# Optimize for recall.
scorer = metrics.make_scorer(metrics.recall_score)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs=-1, verbose=2)
grid_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(grid_cv.best_params_, grid_cv.best_score_))
# Output: Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
#   Best parameters are {'gamma': 0, 'learning_rate': 0.01, 'max_depth': 1,
#   'n_estimators': 50, 'reg_lambda': 5, 'scale_pos_weight': 2, 'subsample': 0.8}
#   with CV score=1.0   Wall time: 11min 30s

# Refit with the best grid-search parameters.
xgb_tuned1 = XGBClassifier(
    random_state=1,
    eval_metric='logloss',
    gamma=0,
    learning_rate=0.01,
    max_depth=1,
    n_estimators=50,
    reg_lambda=5,
    scale_pos_weight=2,
    subsample=0.8,
)
xgb_tuned1.fit(X_train, y_train)

xgb_tuned1_train_score = model_performance_classification_sklearn(
    xgb_tuned1, X_train, y_train
)
print("Training performance:")
xgb_tuned1_train_score  # notebook display
# Output: Accuracy 0.829839  Recall 1.0  Precision 0.829839  F1 0.907008

# Calculating different metrics on validation set
xgb_tuned1_val_score = model_performance_classification_sklearn(xgb_tuned1, X_val, y_val)
print("Validation performance:")
xgb_tuned1_val_score  # notebook display
# Output: Accuracy 0.829665  Recall 1.0  Precision 0.829665  F1 0.906903

# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned1, X_val, y_val)

# %%time  (IPython cell magic — kept as a comment)
# ---- XGBoost tuning 2: RandomizedSearchCV over the same grid ----
model = XGBClassifier(random_state=1, eval_metric='logloss')
param_grid = {
    'n_estimators': np.arange(50, 150, 50),
    'scale_pos_weight': [2, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.05],
    'gamma': [0, 1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'max_depth': np.arange(1, 5, 1),
    'reg_lambda': [5, 10],
}
scorer = metrics.make_scorer(metrics.recall_score)
# Renamed from xgb_tuned2: the original bound the search object to that name
# and then silently clobbered it with the refit classifier below.
xgb_randomized_cv = RandomizedSearchCV(
    estimator=model, param_distributions=param_grid,
    n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs=-1,
)
xgb_randomized_cv.fit(X_train, y_train)
print("Best parameters are {} with CV score={}:".format(xgb_randomized_cv.best_params_, xgb_randomized_cv.best_score_))
# Output: Best parameters are {'subsample': 0.8, 'scale_pos_weight': 2,
#   'reg_lambda': 10, 'n_estimators': 100, 'max_depth': 1, 'learning_rate': 0.01,
#   'gamma': 5} with CV score=1.0   Wall time: 14.5 s

# Refit with the best randomized-search parameters.
xgb_tuned2 = XGBClassifier(
    random_state=1,
    eval_metric='logloss',
    gamma=5,
    learning_rate=0.01,
    max_depth=1,
    n_estimators=100,
    reg_lambda=10,
    scale_pos_weight=2,
    subsample=0.8,
)
xgb_tuned2.fit(X_train, y_train)

xgb_tuned2_train_score = model_performance_classification_sklearn(
    xgb_tuned2, X_train, y_train
)
print("Training performance:")
xgb_tuned2_train_score  # notebook display
# Output: Accuracy 0.829839  Recall 1.0  Precision 0.829839  F1 0.907008

# Calculating different metrics on validation set
xgb_tuned2_val_score = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgb_tuned2_val_score  # notebook display
# Output: Accuracy 0.829665  Recall 1.0  Precision 0.829665  F1 0.906903

# creating confusion matrix
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
# Comparison of the six tuned models on training and validation data.
# One shared column list — previously duplicated (with a capitalization
# inconsistency: "Random Search" vs "Random search") across the two tables.
tuned_comparison_columns = [
    "AdaBoost Tuned with Grid search",
    "AdaBoost Tuned with Random search",
    "GradientBoost Tuned with Grid search",
    "GradientBoost Tuned with Random Search",
    "Xgboost Tuned with Grid search",
    "Xgboost Tuned with Random Search",
]

# Training performance comparison.
models_train_comp_df = pd.concat(
    [
        abc_tuned1_train_score.T,
        abc_tuned2_train_score.T,
        gbc_tuned1_train_score.T,
        gbc_tuned2_train_score.T,
        xgb_tuned1_train_score.T,
        xgb_tuned2_train_score.T,
    ],
    axis=1,
)
models_train_comp_df.columns = tuned_comparison_columns
print("Training performance comparison:")
models_train_comp_df
# Output: AdaBoost/XGBoost tuned models all at Accuracy 0.829839, Recall 1.0,
# Precision 0.829839, F1 0.907008 (they predict the positive class everywhere);
# both GradientBoost models at Accuracy 0.895528, Recall 0.994526,
# Precision 0.891994, F1 0.940474.

# Validation performance comparison.
models_val_comp_df = pd.concat(
    [
        abc_tuned1_val_score.T,
        abc_tuned2_val_score.T,
        gbc_tuned1_val_score.T,
        gbc_tuned2_val_score.T,
        xgb_tuned1_val_score.T,
        xgb_tuned2_val_score.T,
    ],
    axis=1,
)
models_val_comp_df.columns = tuned_comparison_columns
print("Validation performance comparison:")
models_val_comp_df
# Output: AdaBoost/XGBoost at Accuracy 0.829665, Recall 1.0, Precision 0.829665,
# F1 0.906903; both GradientBoost models at Accuracy 0.891509, Recall 0.998105,
# Precision 0.885650, F1 0.938521 — GBC tuned with grid search is the keeper.
# Final evaluation of the chosen model (gbc_tuned1) on the held-out TEST set.
# (Original comment said "validation set", but X_test/y_test is the test split.)
gbc_tuned1_test_score = model_performance_classification_sklearn(gbc_tuned1, X_test, y_test)
print("Testing performance:")
gbc_tuned1_test_score  # notebook display
# Output: Accuracy 0.889413  Recall 0.994315  Precision 0.886261  F1 0.937184

# Horizontal bar chart of feature importances, sorted ascending so the most
# important feature ends up at the top of the chart.
feature_names = X.columns
importances = gbc_tuned1.feature_importances_
indices = np.argsort(importances)
plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], color="violet", align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# BUG FIX: Pipeline, OneHotEncoder and ColumnTransformer were used below but
# never imported anywhere in this file — imported here so the section runs.
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column lists driving the preprocessing ColumnTransformer.
numerical_features = [
    "Customer_Age",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
# Numeric columns: median imputation (robust to skew/outliers).
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

categorical_features = [
    "Gender",
    "Dependent_count",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# Categorical columns: mode imputation, then one-hot encoding; unseen
# categories at predict time are ignored instead of raising.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# remainder="drop": any column not listed above is discarded.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)
# Separating target variable and other variables (restart from the raw data so
# the pipeline handles all preprocessing itself).
X = bank_data.drop("Attrition_Flag", axis=1)
Y = bank_data["Attrition_Flag"]
X.info()
# Output (abbreviated): 10127 rows x 20 columns; Education_Level has 8608
# non-null and Marital_Status 9378 non-null (the pipeline imputes these);
# all other columns complete; dtypes: 5 float64, 10 int64, 5 object.

# Basic preprocessing: drop the ID column and treat the "abc" placeholder in
# Income_Category as missing.
X.drop(["CLIENTNUM"], axis=1, inplace=True)
X["Income_Category"].replace("abc", np.nan, inplace=True)

# Encoding Existing and Attrited customers to 0 and 1 for analysis
# (note: here Attrited=1 is the positive class).
Y.replace("Existing Customer", 0, inplace=True)
Y.replace("Attrited Customer", 1, inplace=True)

# Stratified 70/30 train/test split so both splits keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
# Output: (7088, 19) (3039, 19)

# Final production pipeline: preprocessing + gradient boosting with the
# hyperparameters found by the earlier grid search.
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "GBM",
            GradientBoostingClassifier(
                random_state=1,
                max_features=1,
                subsample=1,
                n_estimators=50,
            ),
        ),
    ]
)
# Fit the model on training data
model.fit(X_train, y_train)
# Output (fitted-pipeline repr, abbreviated):
#   Pipeline(steps=[('pre', ColumnTransformer(num: median-impute 13 numeric
#   columns; cat: mode-impute + one-hot 6 categorical columns)),
#   ('GBM', GradientBoostingClassifier(max_features=1, n_estimators=50,
#   random_state=1, subsample=1))])